Updated: Jun 20, 2020 at 07:23:33 +04.
Working directory: /home/ctl277/Dropbox/ssa_closure.
This script prepares the Census demographics and joins it with annual service areas by spatial interpolation.
Reads:
Writes:
tictoc::tic()
rm(list = ls())
library(dplyr)
library(tidyr)
library(data.table)
library(coler)
library(sf)
Variable info:
var_dict <-
dplyr::tribble(~ variable, ~ label, ~ descrip,
"B00001_001", "pop", "population count",
"B06001_011", "age6574", "65-74",
"B06001_012", "age7485", "74-85",
"B06001_013", "age86p", "86+",
"B99163_004", "english_imp", "ability to speak english in population age 5+ (imputed)",
"B10054_013", "english_lessvw", "speak english less than very well",
"B02001_002", "white", "total white alone",
"B02001_003", "black", "total black alone",
"B06009_003", "educ_hsgrad", "total hs grads",
"B06009_002", "educ_hsless", "total less than hs",
"B07101_002", "nonmovers", "non-movers",
"B06011_001", "income_med", "median income in past 12 months",
"B19313_001", "income", "aggregate income in past 12 months",
"B19057_001", "income_publass", "public assistance income in past 12 months",
"B19055_001", "income_ss", "social security income in past 12 months",
"B16009_001", "poverty", "total poverty status in past 12 months",
"B06012_002", "poverty100m", "total below 100% of poverty level",
"B06012_003", "poverty100150", "total between 100 and 149% of poverty level",
"B10059_001", "poverty_grand", "total poverty status in past 12 months of childrearing grandparents",
################# CENSUS ####################
"P001001", "pop", "population total",
"P006002", "white", "white alone",
"P006003", "black", "black alone",
"P039008", "male65p", "male 65+",
"P039019", "female65p", "female 65+",
"P037011", "educ_hsgrad_male", "male HS grads",
"P037028", "educ_hsgrad_female", "female HS grads",
"P110004", "english_imp", "ability to speak english",
"P087002", "poverty", "total poverty",
"P054001", "income", "aggregate hh income",
"P071001", "income_ss", "SS income",
"P073001", "income_publass", "public assistance income",
"P062002", "indv_ss", "SS count"
)
tract_lng <- readRDS("data/acs/alltracts10-15_lng.rds") %>%
left_join(var_dict, by = "variable")
tract <-
dcast(tract_lng[ , .(year, GEOID, label, estimate)],
year + GEOID ~ label, value.var = "estimate")
tract[ , seniors := age6574 + age7485 + age86p
][ , c("age6574", "age7485", "age86p") := NULL]
ACS 5-year survey (only one with nationwide coverage) didn’t come out until 2010/2011. I need to add the 2000 census data and interpolate for years 2001-2010.
tract00_lng <- readRDS("data/census00_lng.rds") %>%
left_join(var_dict, by = "variable")
tract00 <-
dcast(tract00_lng[ , .(year, GEOID, label, value)],
year + GEOID ~ label, value.var = "value")
tract00[ , seniors := male65p + female65p
][ ,
"educ_hsgrad" := educ_hsgrad_male + educ_hsgrad_female
]
tract00[ ,
c("male65p", "female65p", "educ_hsgrad_male", "educ_hsgrad_female") :=
NULL]
# Preserves column types
dummy <- copy(tract00)
dummy_geo <- dummy$GEOID
dummy[ ] <- NA
dummy[ , GEOID := dummy_geo]
tract_meat <-
map(2001:2009, ~ mutate(dummy, year = as.character(.x))) %>%
rbindlist()
# Find variables that are contained in both Census and ACS data
tract_bread <- rbind(tract00, tract, fill = TRUE)
complete_vars <- intersect(names(tract00), names(tract))
# Sandwich the interpolation dummy rows
tract_full <- bind_rows(tract00[ , ..complete_vars],
tract_meat[ , ..complete_vars],
tract[ , ..complete_vars])
tract_full[ , stfips := str_extract(GEOID, "\\d{2}")]
## Warning in `[.data.table`(tract_full, , `:=`(stfips, str_extract(GEOID, :
## Invalid .internal.selfref detected and fixed by taking a (shallow) copy of the
## data.table so that := can add this new column by reference. At an earlier point,
## this data.table has been copied by R (or was created manually using structure()
## or similar). Avoid names<- and attr<- which in R currently (and oddly) may
## copy the whole data.table. Use set* syntax instead to avoid copying: ?set, ?
## setnames and ?setattr. If this message doesn't help, please report your use case
## to the data.table issue tracker so the root cause can be fixed or this message
## improved.
Interpolation between 2000 and 2010 using the baytrends package.
time_interp_vars <- setdiff(complete_vars, c("year", "GEOID"))
setorder(tract_full, GEOID, year)
system.time(
tract_full[ ,
(time_interp_vars) := lapply(.SD, baytrends::fillMissing),
by = GEOID,
.SDcols = time_interp_vars
]
)
## user system elapsed
## 175.532 0.464 175.753
# user system elapsed
# 107.247 1.723 108.675
# user system elapsed
# 186.152 0.496 186.541
# user system elapsed
# 184.304 1.044 185.335
Still have NA values in tract_wide where 2010 tracts did not match to a 2000 tract or vice versa. Tracts are added and removed according to population changes. See documentation here, particularly pages 8-13.
tract_full[is.na(black)]$year %>% unique()
## [1] "2001" "2002" "2003" "2004" "2005" "2006" "2007" "2008" "2009"
lapply(tract_full, prop_na) %>% bind_rows()
## Example of GEOID missing from 2000 census data
# tract00[GEOID == "01001020801"]
## Example of GEOID missing after 2000 census
# tract[GEOID == "04001940100"]
Pivot to wide format to match sf object.
tract_wide <-
pivot_wider(tract_full, id_cols = GEOID, names_from = year,
values_from = setdiff(complete_vars, c("year", "GEOID")))
lapply(tract_wide, prop_na) %>% bind_rows()
tract_wide %>% filter(substr(GEOID, 1, 2) == "04")
Merge with 2017 computer variables for every year. This data set contains the sf object that I need for spatial interpolation below.
tract17 <- readRDS("data/acs/alltracts17.rds") %>%
dplyr::select(GEOID, state,
total_comp = B28001_001E, no_comp = B28001_011E,
total_internet = B28002_013E)
tract_sf <- left_join(x = tract17, y = tract_wide, by = "GEOID") %>%
st_transform(st_crs(3857))
tract_sf$area <- st_area(tract_sf)
For a given state,
Since st_interpolate_aw() lacks an na.rm parameter, we need to replace NA values with something. These are extensive variables, so I’ll replace with 0. See discussion at https://github.com/r-spatial/sf/issues/830.
During interpolation, nafill column should count the tracts in which I’ve replaced some missing value with 0. Remember to check this against num_tracts later.
num_cols <- sapply(tract_sf, is.numeric) %>% which() %>%
names()
tract_sf$num_tracts <- 1
tract_sf$nafill <-
dplyr::select(tract_sf, all_of(num_cols)) %>% st_drop_geometry() %>%
{!complete.cases(.)} %>% as.numeric()
setnafill(tract_sf, type = "const", fill = 0, cols = num_cols)
tract_sfl <- split(tract_sf, tract_sf$state)
serv_sfl <-
list.files("data/service_areas", full = TRUE) %>%
lapply(readRDS)
names(serv_sfl) <-
map(serv_sfl, "state") %>%
sapply(unique)
interp_fn <- "data/sf_interp/%s.rds"
tract_sfl_select <-
map(tract_sfl,
~ dplyr::select(.x, matches("_20\\d\\d"),
total_comp17 = total_comp,
no_comp17 = no_comp,
total_internet17 = total_internet,
area, num_tracts, nafill))
system.time(
parallel::mclapply(names(tract_sfl_select),
function(st){
# Split service areas by year
interp_yrl <- split(serv_sfl[[st]], serv_sfl[[st]]$year)
# Run interpolation separately for each year
for(yr in as.character(2000:2015)){
# Subset to this year's service area
sa_sf <- filter(serv_sfl[[st]], year == yr)
# Subset to this year's demographics and rename columns
demog_sf <-
dplyr::select(tract_sfl_select[[st]],
matches(yr),
total_comp17,
no_comp17,
total_internet17,
area,
num_tracts,
nafill
) %>%
rename_with(~ gsub(paste0("_", yr), "", .x))
# Interpolate
interp <- st_interpolate_aw(demog_sf, to = sa_sf, extensive = TRUE)
# Re-link with service areas
# Group.1 refers to the row number in `to` (above)
# to which the interpolation corresponds. This fixes
# errors in AZ and FL.
interp2 <-
sa_sf[interp$Group.1, ] %>%
st_drop_geometry() %>%
bind_cols(interp)
# Store in output list
interp_yrl[[yr]] <- interp2
}
out <- bind_rows(interp_yrl)
# return(out)
saveRDS(out, sprintf(interp_fn, st))
}, mc.cores = 5, mc.preschedule = FALSE
)
)
## user system elapsed
## 800.212 50.244 199.361
# user system elapsed
# 793.166 36.290 197.002
# user system elapsed
# 832.362 62.149 207.942
interp_files <- list.files("data/sf_interp", full = TRUE)
interp <- lapply(interp_files, readRDS) %>% bind_rows()
interp <- dplyr::select(interp, state, year, office, everything(), -geometry)
saveRDS(interp, "data/demograph_service.rds")
tictoc::toc()
## 407.302 sec elapsed